In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
Test 2
Input Features: x, x^2
Output / Target: y_noisy
Objective: Show how adding a relevant feature (x^2) improves prediction accuracy
In [2]:
def quad_func(x):
    """Evaluate the quadratic 5*x^2 - 23*x + 47 at x."""
    square_term = 5 * x ** 2
    linear_term = 23 * x
    return square_term - linear_term + 47
In [3]:
# Training Set + Eval Set: 200 samples (70%, 30% split)
# Test Set: 60 samples
# Total: 260 samples
In [4]:
# Reproducible synthetic dataset: 260 samples with x drawn uniformly from [0, 20).
np.random.seed(5)
samples = 260

x_vals = pd.Series(20 * np.random.rand(samples))
x2_vals = x_vals ** 2            # squared feature for the "normal" model
y_vals = x_vals.apply(quad_func)  # clean quadratic target

# Corrupt the target with Gaussian noise (std = 50).
y_noisy_vals = y_vals + 50 * np.random.randn(samples)
In [5]:
# Collect features and both targets into one frame for export and plotting.
frame_columns = {
    'x': x_vals,
    'x2': x2_vals,
    'y': y_vals,
    'y_noisy': y_noisy_vals,
}
df = pd.DataFrame(frame_columns)
In [6]:
# Preview the first rows of the assembled dataset.
df.head()
Out[6]:
In [7]:
# Pairwise correlation matrix of the features and targets.
df.corr()
Out[7]:
In [8]:
# Visualize the clean target against its noisy counterpart.
fig = plt.figure(figsize=(12, 8))
ax = fig.gca()
ax.scatter(df['x'], df['y'], color='r', label='y')
ax.scatter(df['x'], df['y_noisy'], color='b', label='y noisy', marker='+')
ax.set_xlabel('x')
ax.set_ylabel('Target Attribute')
ax.grid(True)
ax.legend()
Out[8]:
In [9]:
# Build the data directory path portably instead of hardcoding Windows
# backslashes; sequences like '\D' are also invalid escapes that warn
# (and will eventually error) on modern Python.
data_path = os.path.join('..', 'Data', 'RegressionExamples', 'quadratic')
In [10]:
# Persist the full dataset (all columns), writing the index as a 'Row' column.
all_csv = os.path.join(data_path, 'quadratic_example_all.csv')
df.to_csv(all_csv, index=True, index_label='Row')
In [11]:
# Training split (first 200 rows) with only the raw x feature — the
# "underfit" scenario for a quadratic relationship.
train_underfit_csv = os.path.join(data_path, 'quadratic_example_train_underfit.csv')
df.loc[df.index < 200].to_csv(train_underfit_csv,
                              index=True,
                              index_label='Row',
                              columns=['x', 'y_noisy'])
In [12]:
# Training split (first 200 rows) with both x and x^2 — the "normal" scenario.
train_normal_csv = os.path.join(data_path, 'quadratic_example_train_normal.csv')
df.loc[df.index < 200].to_csv(train_normal_csv,
                              index=True,
                              index_label='Row',
                              columns=['x', 'x2', 'y_noisy'])
In [13]:
# Test set (all rows) with only the x feature, for the underfit model.
test_underfit_csv = os.path.join(data_path, 'quadratic_example_test_all_underfit.csv')
df.to_csv(test_underfit_csv,
          index=True,
          index_label='Row',
          columns=['x'])
In [14]:
# Test set (all rows) with both x and x^2, for the normal model.
test_normal_csv = os.path.join(data_path, 'quadratic_example_test_all_normal.csv')
df.to_csv(test_normal_csv,
          index=True,
          index_label='Row',
          columns=['x', 'x2'])
In [15]:
# Pull predictions produced by the model trained WITHOUT the quadratic term,
# and reload the reference dataset keyed on the 'Row' column.
df = pd.read_csv(os.path.join(data_path, 'quadratic_example_all.csv'),
                 index_col='Row')
underfit_pred_path = os.path.join(
    data_path, 'output_underfit',
    'bp-pNYIAR35aSV-quadratic_example_test_all_underfit.csv.gz')
df_predicted_underfit = pd.read_csv(underfit_pred_path)
df_predicted_underfit.columns = ['Row', 'y_predicted']
In [16]:
# Compare actual noisy targets with predictions from the x-only model.
fig = plt.figure(figsize=(12, 8))
ax = fig.gca()
ax.scatter(df.x, df.y_noisy, color='b', label='actual', marker='+')
ax.scatter(df.x, df_predicted_underfit.y_predicted,
           color='g', label='Fit (x)', marker='^')
ax.set_title('Quadratic - underfit')
ax.set_xlabel('x')
ax.set_ylabel('Target Attribute')
ax.grid(True)
ax.legend()
Out[16]:
Test 1: Training RMSE: 385.18, Evaluation RMSE: 257.89, Baseline RMSE: 437.31 Wojciech results: Training RMSE: 385.16, Evaluation RMSE: 257.898, Baseline RMSE: 437.311
The model's RMSE is large and close to the baseline RMSE, indicating an underfit.
In [17]:
# Distribution comparison: actual targets vs. underfit predictions.
fig = plt.figure(figsize=(12, 8))
series_to_plot = [df.y_noisy, df_predicted_underfit.y_predicted]
plt.boxplot(series_to_plot, labels=['actual', 'predicted-underfit'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('y')
plt.grid(True)
In [18]:
# Summary statistics of the actual (noisy) target.
df.y_noisy.describe()
Out[18]:
In [19]:
# Summary statistics of the underfit model's predictions.
df_predicted_underfit.y_predicted.describe()
Out[19]:
In [20]:
# Pull predictions from the model trained WITH both x and x^2 features.
normal_pred_path = os.path.join(
    data_path, 'output_normal',
    'bp-In6EUvWaCw2-quadratic_example_test_all_normal.csv.gz')
df_predicted_normal = pd.read_csv(normal_pred_path)
df_predicted_normal.columns = ['Row', 'y_predicted']
In [21]:
# Overlay the actual values with both fits: x-only (underfit) and x + x^2.
fig = plt.figure(figsize=(12, 8))
plt.scatter(x=df.x,
            y=df.y_noisy,
            color='b',
            label='actual',
            marker='+')
plt.scatter(x=df.x,
            y=df_predicted_underfit.y_predicted,
            color='g',
            label='Fit (x)',
            marker='^')
plt.scatter(x=df.x,
            y=df_predicted_normal.y_predicted,
            color='r',
            label='Fit (x,x^2)')
plt.title('Quadratic - normal fit')
plt.grid(True)
plt.xlabel('x')
plt.ylabel('Target Attribute')
# Every scatter above sets label=, and the sibling plot cells render a
# legend; it was commented out here, leaving the three series unidentified.
plt.legend()
Out[21]:
Test 1: Training RMSE: 385.16, Evaluation RMSE: 257.89, Baseline RMSE: 437.31
Test 2: Training RMSE: 132.20, Evaluation RMSE: 63.68, Baseline RMSE: 437.31
Test 2's RMSE is much lower than the baseline. Note that the noise added to y is Gaussian with a standard deviation of 50, so a residual RMSE of roughly that magnitude is expected.
In [22]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df.y_noisy,df_predicted_underfit.y_predicted, df_predicted_normal.y_predicted],
labels = ['actual','predicted-underfit','predicted-normal'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('y')
plt.grid(True)
In [23]:
# Peek at the first rows of the underfit model's predictions.
df_predicted_underfit.head()
Out[23]:
In [24]:
# Peek at the first rows of the normal model's predictions.
df_predicted_normal.head()
Out[24]: